%% Order the proteins in the dataset of De Godoy et al. (2008)
% Start from a clean workspace. Plain `clear` is sufficient here; `clear all`
% additionally purges cached functions and breakpoints, which only slows the
% script down.
clear; close all; clc;

% Load Excel file
data = readtable('DeGodoy_Table6_LysC_proteins.xls');

% Define sectors.
% Every sector starts as a 0-row table with the same columns as `data`, so
% that a sector which never receives a protein can still be indexed by
% column later on (an empty double `[]` cannot).
emptySector = data([], :);

Upt   = emptySector;
UGlc  = emptySector;
LGlc  = emptySector;
Resp  = emptySector;
Ferm  = emptySector;
Treh  = emptySector;
ESnk  = emptySector;
Grwt  = emptySector;
Struc = emptySector;

% Proteins annotated exactly as 'Oxidative phosphorylation' (kept in
% addition to their membership of Resp)
oxphor = emptySector;

% Define Glc sector for later categorization
Glc = emptySector;

% Extra sector to store proteins that have not been appointed to a sector
Rest = emptySector;

% Order data
%
% Each row of `data` is assigned to exactly one sector (plus the auxiliary
% `oxphor` / `Rest` tables) by the FIRST rule that matches, in this order:
%   1. name fragments found in column 2,
%   2. pathway annotation in column 22,
%   3. annotation terms in columns 16 and 18,
%   4. fallback: stored in Rest and also counted in Grwt.
% `contains` is case-sensitive and returns true when ANY pattern in the
% list occurs in the text.

% --- Rule set 1: name fragments (column 2) ---
uptNames   = {'HXK', 'GLK', 'HXT'};
uglcNames  = {'PGI', 'FBP', 'PFK'};
lglcNames  = {'FBA', 'TPI', 'TDH', 'PGK', 'GPM', 'ENO', 'PYK', 'PCK'};
fermNames  = {'PDC', 'ADH'};
esnkNames  = {'GPD', 'RHR', 'HOR'};
respNames  = {'PYC', 'PDA', 'PDB', 'PDX', 'LPD', 'CIT', 'MDH', 'ACO', ...
              'MLS', 'ICL', 'IDH', 'IDP', 'SDH', 'LCS', 'FUM', 'KGD', ...
              'LAT', 'ALD', 'ACS'};
trehNames  = {'PGM', 'TPS', 'GSY', 'NTH', 'ATH', 'GPH'};
strucNames = {'HSP'};

% --- Rule set 2: KEGG pathway terms (column 22) ---
oxphorKegg = 'Oxidative phosphorylation';   % compared with strcmp (exact match)
respKegg   = {'TCA cycle'};
grwtKegg   = {'Biosynthesis', 'biosynthesis', 'transcription', ...
              'Cell cycle', 'DNA', 'metabolism', ...
              'Pentose phosphate pathway', 'Ribosome', 'RNA', 'SNARE', ...
              'Proteasome'};
strucKegg  = {'signaling'};

% --- Rule set 3: annotation terms in columns 16 and 18 ---
% NOTE(review): the original comment called these "GOMF & GOMB" names;
% which column holds which GO category is not visible here - confirm
% against the spreadsheet header.
grwtTerms18  = {'RNA', 'DNA', 'cytoskeleton', 'cell wall', ...
                'biosynthesis', 'biosynthetic', 'translation', ...
                'transcription', 'biogenesis', 'reproduction', ...
                'assembly', 'Golgi'};
grwtTerms16  = {'cytoskeleton', 'structural'};
strucTerms18 = {'stress', 'maintenance'};

for i = 1:size(data,1)
    row     = data(i,:);
    stdName = data{i,2};
    kegg    = data{i,22};
    terms16 = data{i,16};
    terms18 = data{i,18};

    % First filter out some proteins based on standard names
    if contains(stdName, uptNames)
        Upt = [Upt; row];
    elseif contains(stdName, uglcNames)
        UGlc = [UGlc; row];
    elseif contains(stdName, lglcNames)
        LGlc = [LGlc; row];
    elseif contains(stdName, fermNames)
        Ferm = [Ferm; row];
    elseif contains(stdName, esnkNames)
        ESnk = [ESnk; row];
    elseif contains(stdName, respNames)
        Resp = [Resp; row];
    elseif contains(stdName, trehNames)
        Treh = [Treh; row];
    elseif contains(stdName, strucNames)
        Struc = [Struc; row];

    % Sort the remaining proteins based on the KEGG Pathway names
    elseif strcmp(kegg, oxphorKegg) || contains(kegg, respKegg)
        Resp = [Resp; row];
        % Oxidative-phosphorylation proteins are additionally tracked on
        % their own
        if strcmp(kegg, oxphorKegg)
            oxphor = [oxphor; row];
        end
    elseif contains(kegg, grwtKegg)
        Grwt = [Grwt; row];
    elseif contains(kegg, strucKegg)
        Struc = [Struc; row];

    % Sort the remaining proteins based on the terms in columns 16 and 18
    elseif contains(terms18, grwtTerms18) || contains(terms16, grwtTerms16)
        Grwt = [Grwt; row];
    elseif contains(terms18, strucTerms18)
        Struc = [Struc; row];

    % Anything still unassigned is recorded in Rest and counted as growth
    else
        Rest = [Rest; row];
        Grwt = [Grwt; row];
    end
end

%% Calculate proteome allocation & make proteomap

% Sector order used for everything below (Intensity, sign_sec, phi, names):
% Upt - UGlc - LGlc - Ferm - ESnk - Resp - Treh - Grwt - Struc
sectors = {Upt, UGlc, LGlc, Ferm, ESnk, Resp, Treh, Grwt, Struc};
names   = {'Upt', 'Uglc', 'Lglc', 'Ferm', 'ESnk', 'Resp', 'Treh', 'Grwt', 'Struc'};
nSectors = numel(sectors);

% Column of `data` holding the measured intensity
intensityCol = 37;

% Store measured intensities (rows with a missing intensity are dropped)
Intensity = cell(nSectors, 1);
for i = 1:nSectors
    if isempty(sectors{i})
        % A sector without any protein contributes nothing; avoid indexing
        % into an empty array.
        Intensity{i} = zeros(0, 1);
    else
        Intensity{i} = table2array(rmmissing(sectors{i}(:, intensityCol)));
    end
end

% Calculate total intensities per sector
sign_sec = zeros(1, nSectors);
sign_tot = 0;
for i = 1:nSectors
    sign_sec(i) = sum(Intensity{i});
    sign_tot = sign_tot + sign_sec(i);
end

% Calculate proteome allocation: fraction of the total intensity per sector,
% in the sector order listed above. Deliberately left without a semicolon so
% the fractions are echoed to the command window.
phi = sign_sec./sign_tot

% Make a proteomap
figure(1)
textscale = 0.3;
Proteomap(phi,names,textscale)
% title('Batch (De Godoy et al., 2008)', 'FontSize', 20)